1 Setup

This section and the next are relevant for reproducibility purposes. For results, please skip to section 3 (Quality Control)

  • Libraries
suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(magrittr)
  library(parallel)
  library(pheatmap)
  library(plotly)
  library(pvclust)
  library(tidyverse)
  library(tximport)
  library(vsn)
  library(emoji)
})
  • Helper functions
source(here("UPSCb-common/src/R/featureSelection.R"))
  • Graphics
hpal <- colorRampPalette(c("blue","white","red"))(100)

2 Data

  • Sample information
samples <- read_csv(here("data/drought_roots.csv"),
                      col_types=cols(.default=col_factor()))
  • tx2gene translation table
tx2gene <- suppressMessages(read_delim(here("reference/annotation/Picab02_tx2gene.tsv.gz"), delim="\t", col_names=c("TXID","GENE"), skip=1))
  • Raw data
filelist <- list.files(here("results/Salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)
  • Sanity check to ensure that the data is sorted according to the sample info
stopifnot(all(match(sub("[1-3]_151118_BC852HANXX_P2503_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(filelist)))),
                    samples$SciLifeID) == 1:nrow(samples)))

samples_rep <-rbind(samples,samples)
samples_rep <- rbind(samples_rep,samples)
  • add filelist to samples as a new column
samples_rep %<>% mutate(Filenames = filelist)
samples_rep$BioID <- sub("[1-3]_151118_BC852HANXX_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(samples_rep$Filenames))))
  • Read the expression at the gene level
txi <- suppressMessages(tximport(files = samples_rep$Filenames,
                                 type = "salmon",
                                 tx2gene=tx2gene))
counts <- txi$counts

counts <- do.call(
  cbind,
  lapply(split.data.frame(t(counts),
                          samples$SciLifeID),
         colSums))

csamples <- samples[,-1]
csamples <- csamples[match(colnames(counts),csamples$SciLifeID),]
colnames(counts) <- csamples$SciLifeID
counts <- as.data.frame(counts)
  • Keep only the expressed genes in both conditions
expr_genes <- read.table(here("data/analysis/common_genes.txt"))
colnames(expr_genes) <- "Genes"
counts <- filter(counts, rownames(counts) %in% expr_genes$Genes)

 

3 Quality Control

3.1 “Not expressed” genes

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "0% percent (0) of 31943 genes are not expressed"

3.2 Sequencing depth

  • Let us take a look at the sequencing depth, colouring by Level
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(csamples)

ggplot(dat,aes(x,y,fill=Level)) + 
  geom_col() + 
  scale_y_continuous(name="reads") +
  facet_grid(~ factor(Level), scales = "free") +
  theme_bw() + 
  theme(axis.text.x=element_text(angle=90,size=4), axis.title.x=element_blank()) +
  labs(title ="Sample sequencing depth")

👉 There is not a big difference in the raw sequencing depth

3.3 Per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) + 
  geom_density(na.rm = TRUE) +
  ggtitle("Gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)") + 
  theme_bw()

👉 The per-gene mean expression is as expected since the highest peak is around 2

3.4 Per-sample expression

dat <- as.data.frame(log10(counts)) %>% 
  utils::stack() %>% 
  mutate(Level=samples_rep$Level[match(ind,csamples$SciLifeID)]) 


ggplot(dat,aes(x=values,group=ind,col=Level)) + 
  geom_density(na.rm = TRUE) + 
  ggtitle("Sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)") + 
  theme_bw()

👉 All samples have the same sequencing depth characteristics and there is no deviation when we look at one or the other condition

  • Export raw expression data
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data_merge.csv"))

 

4 Data normalisation

4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample condition and replicate.

load(file=here("data/analysis/salmon/dds_merge_expr_genes.rda"))

4.2 Size factors

(i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds) %>% 
  suppressMessages()

boxplot of the sequencing libraries size factor:

boxplot(normalizationFactors(dds),
        main="Sequencing libraries size factor",
        las=2,log="y")
abline(h=1, col = "Red", lty = 3)

and without outliers:

boxplot(normalizationFactors(dds),
        main="Sequencing libraries size factor",
        las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)

👉 The sequencing libraries size factor is a bit variable across samples

4.3 Validation

Let’s look at standard deviations before and after VST normalization. This plot is to see whether there is a dependency of SD on the mean.

Before:

meanSdPlot(log1p(counts(dds))[rowSums(counts(dds))>0,])

After VST normalization, the red line should be almost horizontal which indicates no dependency of variance on mean (homoscedastic).

4.4 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)

meanSdPlot(vst[rowSums(vst)>0,])

👉 We can conclude that the variance stabilization worked adequately even if the red line is not perfectly horizontal


 

4.5 QC on the normalised data

4.5.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)

4.5.2 Scree plot

We define the number of variables of the model (=1) and the number of possible combinations (=8).

We plot the percentage explained by different components

  • the red line represents number of variables in the model
  • the orange line represents number of variable combinations.
nvar=1
nlevel=nlevels(dds$Level)
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)+
  ggtitle("Percentage of variance explained by different components")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

4.5.3 PCA plot

dds$Filenames <- sub("*_151118_BC852HANXX_P2503_", "_", sub("*_sortmerna_trimmomatic","",basename(dirname(dds$Filenames))))
pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(dds)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Level,shape=Level,text=Filenames)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 8. Consider
## specifying shapes manually if you must have them.

👉 Some conditions clusterized better than others in the PCA plot

4.6 Sequencing depth

Number of genes expressed per condition at different cutoffs:

conds <- factor(dds$Level)
dev.null <- rangeSamplesSummary(counts=vst,
                                conditions=conds,
                                nrep=3)

4.7 Heatmap

Filter for noise

sels <- rangeFeatureSelect(counts=vst, 
                           conditions=conds,
                           nrep=3) %>% 
  suppressWarnings()

vst.cutoff <- 2
abline(h=10000, col="Red", lty=3)

  • Set the cut off to 7 in order to retrieve less than 10 000 genes
vst.cutoff <- 7

hm_reduced <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
                distfun=pearson.dist,
                hclustfun=function(X){hclust(X,method="ward.D2")},
                labRow = NA,trace = "none",
                labCol = conds,
                col=hpal)

plot(as.hclust(hm_reduced$colDendrogram),xlab="",sub="")

  • Using pheatmap
mat <- t(scale(t(vst[sels[[vst.cutoff+1]],])))
hm <- pheatmap(mat,
               color = hpal,
               border_color = NA,
               clustering_distance_rows = "correlation",
               clustering_method = "ward.D2",
               show_rownames = F,
               labels_col = conds,
               angle_col = 90,
               legend = F)

👉 The different conditions are not so different in gene expression level except the extreme conditions

4.8 Clustering of samples

Done to assess the previous dendrogram’s reproducibility

hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
                       method.hclust = "ward.D2", 
                       nboot = 1000, parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.

Plot the clustering with bp and au

plot(hm.pvclust, labels = conds, cex.axis=1.5)
pvrect(hm.pvclust)

👉 Some conditions clusterize better than others

bootstrapping results as a table
print(hm.pvclust, digits=3)
## 
## Cluster method: ward.D2
## Distance      : correlation
## 
## Estimates on edges:
## 
##       si    au    bp se.si se.au se.bp      v      c  pchi
## 1  0.886 0.942 0.946 0.022 0.014 0.002 -1.593 -0.019 0.311
## 2  0.841 0.940 0.823 0.021 0.010 0.004 -1.241  0.316 0.860
## 3  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 4  0.939 0.959 0.996 0.040 0.030 0.001 -2.188 -0.445 0.160
## 5  0.939 0.959 0.996 0.040 0.030 0.001 -2.188 -0.445 0.160
## 6  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 7  0.934 0.961 0.986 0.023 0.016 0.001 -1.988 -0.222 0.990
## 8  0.926 0.955 0.987 0.026 0.018 0.001 -1.955 -0.257 0.994
## 9  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 10 0.154 0.567 0.590 0.036 0.031 0.005 -0.198 -0.030 0.206
## 11 0.972 0.987 0.977 0.010 0.005 0.002 -2.114  0.123 0.792
## 12 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 13 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 14 0.928 0.955 0.991 0.028 0.020 0.001 -2.023 -0.328 0.827
## 15 0.437 0.737 0.681 0.038 0.026 0.005 -0.552  0.082 0.594
## 16 0.244 0.678 0.536 0.040 0.028 0.005 -0.276  0.186 0.797
## 17 0.068 0.583 0.478 0.037 0.030 0.005 -0.077  0.132 0.089
## 18 0.444 0.838 0.455 0.042 0.019 0.005 -0.437  0.550 0.562
## 19 0.085 0.559 0.522 0.036 0.030 0.005 -0.103  0.047 0.289
## 20 0.367 0.823 0.404 0.045 0.021 0.005 -0.342  0.586 0.646
## 21 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 22 0.397 0.839 0.400 0.045 0.019 0.005 -0.368  0.621 0.558
## 23 0.686 0.844 0.839 0.033 0.021 0.004 -1.002  0.011 0.131
## 24 0.686 0.844 0.839 0.033 0.021 0.004 -1.002  0.011 0.131
## 25 0.686 0.844 0.839 0.033 0.021 0.004 -1.002  0.011 0.131
## 26 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 27 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000

 

5 Summary

The data are quite good so we can continue with our analysis

6 Session Info

Session Info
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] parallel  grid      stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] emoji_15.0                  vsn_3.64.0                 
##  [3] tximport_1.24.0             forcats_1.0.0              
##  [5] stringr_1.5.0               dplyr_1.1.0                
##  [7] purrr_1.0.1                 readr_2.1.3                
##  [9] tidyr_1.3.0                 tibble_3.1.8               
## [11] tidyverse_1.3.2             pvclust_2.2-0              
## [13] plotly_4.10.1               pheatmap_1.0.12            
## [15] magrittr_2.0.3              hyperSpec_0.100.0          
## [17] xml2_1.3.3                  ggplot2_3.4.1              
## [19] lattice_0.20-45             here_1.0.1                 
## [21] gplots_3.1.3                DESeq2_1.36.0              
## [23] SummarizedExperiment_1.28.0 Biobase_2.58.0             
## [25] MatrixGenerics_1.10.0       matrixStats_0.63.0         
## [27] GenomicRanges_1.50.1        GenomeInfoDb_1.34.4        
## [29] IRanges_2.32.0              S4Vectors_0.36.1           
## [31] BiocGenerics_0.44.0         data.table_1.14.6          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.1           backports_1.4.1        lazyeval_0.2.2        
##   [4] splines_4.2.0          crosstalk_1.2.0        BiocParallel_1.32.4   
##   [7] digest_0.6.31          htmltools_0.5.4        fansi_1.0.4           
##  [10] memoise_2.0.1          googlesheets4_1.0.1    tzdb_0.3.0            
##  [13] limma_3.52.4           Biostrings_2.66.0      annotate_1.74.0       
##  [16] modelr_0.1.10          vroom_1.6.1            timechange_0.2.0      
##  [19] jpeg_0.1-10            colorspace_2.1-0       blob_1.2.3            
##  [22] rvest_1.0.3            haven_2.5.1            xfun_0.36             
##  [25] hexbin_1.28.2          crayon_1.5.2           RCurl_1.98-1.10       
##  [28] jsonlite_1.8.4         genefilter_1.78.0      survival_3.5-0        
##  [31] glue_1.6.2             gtable_0.3.1           gargle_1.3.0          
##  [34] zlibbioc_1.44.0        XVector_0.38.0         DelayedArray_0.24.0   
##  [37] scales_1.2.1           DBI_1.1.3              Rcpp_1.0.10           
##  [40] viridisLite_0.4.1      xtable_1.8-4           bit_4.0.5             
##  [43] preprocessCore_1.58.0  htmlwidgets_1.6.1      httr_1.4.4            
##  [46] RColorBrewer_1.1-3     ellipsis_0.3.2         farver_2.1.1          
##  [49] pkgconfig_2.0.3        XML_3.99-0.13          sass_0.4.5            
##  [52] dbplyr_2.3.0           deldir_1.0-6           locfit_1.5-9.7        
##  [55] utf8_1.2.3             labeling_0.4.2         tidyselect_1.2.0      
##  [58] rlang_1.0.6            AnnotationDbi_1.60.0   munsell_0.5.0         
##  [61] cellranger_1.1.0       tools_4.2.0            cachem_1.0.6          
##  [64] cli_3.6.0              generics_0.1.3         RSQLite_2.2.20        
##  [67] broom_1.0.3            evaluate_0.20          fastmap_1.1.0         
##  [70] yaml_2.3.7             knitr_1.42             bit64_4.0.5           
##  [73] fs_1.6.0               caTools_1.18.2         KEGGREST_1.38.0       
##  [76] brio_1.1.3             compiler_4.2.0         rstudioapi_0.14       
##  [79] png_0.1-8              testthat_3.1.6         affyio_1.66.0         
##  [82] reprex_2.0.2           geneplotter_1.74.0     bslib_0.4.2           
##  [85] stringi_1.7.12         highr_0.10             Matrix_1.5-3          
##  [88] vctrs_0.5.2            pillar_1.8.1           lifecycle_1.0.3       
##  [91] BiocManager_1.30.19    jquerylib_0.1.4        bitops_1.0-7          
##  [94] R6_2.5.1               latticeExtra_0.6-30    affy_1.74.0           
##  [97] KernSmooth_2.23-20     codetools_0.2-18       gtools_3.9.4          
## [100] assertthat_0.2.1       rprojroot_2.0.3        withr_2.5.0           
## [103] GenomeInfoDbData_1.2.9 hms_1.1.2              rmarkdown_2.20        
## [106] googledrive_2.0.0      lubridate_1.9.1        interp_1.1-3